Here I study how biomass density changes across treatments in the PatchSizePilot. In particular, I’m studying how biomass density changes across:

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## here() starts at /Users/ema/github/PatchSizePilot

Import

culture_info = read.csv(here("data", "PatchSizePilot_culture_info.csv"), header = TRUE)
load(here("data", "population", "t0.RData")); t0 = pop_output
load(here("data", "population", "t1.RData")); t1 = pop_output
load(here("data", "population", "t2.RData")); t2 = pop_output
load(here("data", "population", "t3.RData")); t3 = pop_output
load(here("data", "population", "t4.RData")); t4 = pop_output
load(here("data", "population", "t5.RData")); t5 = pop_output
load(here("data", "population", "t6.RData")); t6 = pop_output
load(here("data", "population", "t7.RData")); t7 = pop_output
rm(pop_output)

Tidy

t0$time = NA
t1$time = NA

t0$replicate_video = 1:12 #In t1 I took 12 videos of a single 
t1$replicate_video = 1 #In t1 I took only 1 video/culture
t2$replicate_video = 1 #In t2 I took only 1 video/culture
t3$replicate_video = 1 #In t3 I took only 1 video/culture
t4$replicate_video = 1 #In t4 I took only 1 video/culture
t5$replicate_video = 1 #In t5 I took only 1 video/culture
t6$replicate_video = t6$replicate; t6$replicate = NULL
t7$replicate_video = t7$replicate; t7$replicate = NULL

#Create an elongated version of t0 so that each of the 110 cultures can have 12 video replicates at t0.
elongating_t0 = NULL
for (video in 1:nrow(t0)){
  for (ID in 1:nrow(culture_info)) {
    elongating_t0 = rbind(elongating_t0, t0[video,])
  }
}

ID_vector = rep(1:nrow(culture_info), times = nrow(t0))
elongating_t0$culture_ID = ID_vector

t0 = merge(culture_info,elongating_t0, by="culture_ID")
t1 = merge(culture_info,t1,by="culture_ID")
t2 = merge(culture_info,t2,by="culture_ID")
t3 = merge(culture_info,t3,by="culture_ID")
t4 = merge(culture_info,t4,by="culture_ID")
t5 = merge(culture_info,t5,by="culture_ID")
t6 = merge(culture_info,t6,by="culture_ID")
t7 = merge(culture_info,t7,by="culture_ID")
ds = rbind(t0, t1, t2, t3, t4, t5, t6, t7); rm(elongating_t0, t0, t1, t2, t3, t4, t5, t6, t7)

# Switch from data point to day
ds$day = ds$time_point; ds$time_point = NULL
ds$day[ds$day=="t0"] = "0"
ds$day[ds$day=="t1"] = "4"
ds$day[ds$day=="t2"] = "8"
ds$day[ds$day=="t3"] = "12"
ds$day[ds$day=="t4"] = "16"
ds$day[ds$day=="t5"] = "20"
ds$day[ds$day=="t6"] = "24"
ds$day[ds$day=="t7"] = "28"
ds$day = as.numeric(ds$day)

ds = ds %>% 
  select(culture_ID, patch_size, disturbance, metaecosystem_type, bioarea_per_volume, replicate_video, day, metaecosystem, system_nr, eco_metaeco_type)
ds = ds[, c("culture_ID", "system_nr", "disturbance", "day", "patch_size", "metaecosystem", "metaecosystem_type", "eco_metaeco_type", "replicate_video","bioarea_per_volume")]

ds$eco_metaeco_type = factor(ds$eco_metaeco_type, levels=c('S', 'S (S_S)', 'S (S_L)', 'M', 'M (M_M)', 'L', 'L (L_L)', 'L (S_L)'))
ds_regional = ds %>%
    group_by(culture_ID, system_nr, disturbance, day, patch_size, metaecosystem_type) %>%
    summarise(patch_mean_bioarea_across_videos = mean(bioarea_per_volume)) %>%
    group_by(system_nr, disturbance, day,metaecosystem_type) %>%
    summarise(regional_mean_bioarea = mean(patch_mean_bioarea_across_videos))

Plot

Biomass @ low disturbance

Let’s now save all the plots together in a single image. As the single image is too large for R markdown, it cannot be displayed here.

grid = grid.arrange(p[[1]],p[[3]],p[[5]],p[[2]],p[[4]],p[[6]],
                    ncol=3, nrow=2,
                    top = textGrob("Low disturbance",gp=gpar(fontsize=20,font=3)))
ggsave(here("results", "biomass", "Clean_biomass_low.jpg"), grid, width = 22, height = 13)

Biomass @ high disturbance

Let’s save also here for high disturbance.

biomass_plots("high")
grid = grid.arrange(p[[1]],p[[3]],p[[5]],p[[2]],p[[4]],p[[6]],
             ncol=3, nrow=2,
             top = textGrob("High disturbance",gp=gpar(fontsize=20,font=3)))
ggsave(here("results", "biomass", "Clean_biomass_high.jpg"), grid, width = 22, height = 13)

Mixed effect models

To build the mixed effect models we will use the R package lme4. See page 6 of this PDF to know more about the syntaxis of this package.

To do model diagnostics, I’m going to look at the following two plots (as suggested by Zuur et al. (2009), page 487):

Regional biomass

When modelling the regional biomass, we want to understand if the regional biomass produced by an ecosystem with a small and a large patch (metaecosystem_type = S_L) is lower than the regional biomass produced by an ecosystem with two medium patches (metaecosystem_type = M_M).

Create regional biomass data set

First of all, let’s produce a data set including the regional biomass of all our meta ecosystems. In this data set, not only we want to have the regional biomass of the meta-ecosystems (averaged first across videos and then across patches) but also:

  • We want to include only the meta-ecosystems in which patches had both medium size (metaecosystem_type = M_M) and meta-ecosystems in which patches had one a small size and the other large size (metaecosystem_type = S_L).

  • We want to take off the first time point (day = 0). This is because all the meta-ecosystems had the same regional biomass, which was a construct of our experimental design.

ds_regional = ds_regional %>%
    filter (metaecosystem_type == "M_M" | metaecosystem_type == "S_L", day != 0)

Model selection

First of all, let’s include time as a random variable. Let’s start from the largest mixed effect model makes sense to construct. This will include:

  • As fixed effect: disturbance, metaecosystem type, and their interaction
  • As random effect (random intercept and slope): day and the number of the metaecosystem (system_nr).

We are actually going to construct two full models. One with the correlated and one with the uncorrelated random slopes and intercept, the other without.

#Uncorrelated intercepts and slopes
full_model = lmer(regional_mean_bioarea ~ metaecosystem_type  + disturbance + metaecosystem_type * disturbance + (metaecosystem_type || day) + (disturbance || day) + (metaecosystem_type*disturbance  || day) + (metaecosystem_type || system_nr) + (disturbance || system_nr) + (metaecosystem_type*disturbance  || system_nr), data = ds_regional, REML = FALSE)

#Correlated intercepts and slopes
full_model_correlated = lmer(regional_mean_bioarea ~ metaecosystem_type  + disturbance + metaecosystem_type * disturbance + (metaecosystem_type | day) + (disturbance | day) + (metaecosystem_type*disturbance  | day) + (metaecosystem_type | system_nr) + (disturbance | system_nr) + (metaecosystem_type*disturbance  | system_nr), data = ds_regional, REML = FALSE)

anova(full_model, full_model_correlated)
## Data: ds_regional
## Models:
## full_model_correlated: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + (metaecosystem_type | day) + (disturbance | day) + (metaecosystem_type * disturbance | day) + (metaecosystem_type | system_nr) + (disturbance | system_nr) + (metaecosystem_type * disturbance | system_nr)
## full_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr)) + ((1 | system_nr) + (0 + disturbance | system_nr)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr) + (0 + disturbance | system_nr) + (0 +      metaecosystem_type:disturbance | system_nr))
##                       npar    AIC    BIC  logLik deviance Chisq Df Pr(>Chisq)
## full_model_correlated   37 2221.3 2330.2 -1073.7   2147.3                    
## full_model              55 2257.3 2419.1 -1073.7   2147.3     0 18          1

Although the AIC for the model with correlated slopes is lower, there is not statistical significance between the two models. Let’s then keep as a full model the one with non correlated slopes (full_model).

Should we keep the random effect of the metaecosystem number?

no_system_nr_model = lmer(regional_mean_bioarea ~ metaecosystem_type  + disturbance + metaecosystem_type * disturbance + (metaecosystem_type || day) + (disturbance || day) + (metaecosystem_type*disturbance || day), data = ds_regional, REML = FALSE)

anova(full_model, no_system_nr_model)
## Data: ds_regional
## Models:
## no_system_nr_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day))
## full_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr)) + ((1 | system_nr) + (0 + disturbance | system_nr)) + ((1 | system_nr) + (0 + metaecosystem_type | system_nr) + (0 + disturbance | system_nr) + (0 +      metaecosystem_type:disturbance | system_nr))
##                    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## no_system_nr_model   30 2217.1 2305.4 -1078.5   2157.1                     
## full_model           55 2257.3 2419.1 -1073.7   2147.3 9.7713 25     0.9972

The difference between the two doesn’t seem to be significant. Therefore, let’s throw out the system_nr from our model.

Should we keep the random effects?

fixed_effects_model = lm(regional_mean_bioarea ~ metaecosystem_type  + disturbance, data = ds_regional)

anova(no_system_nr_model, fixed_effects_model)
## Data: ds_regional
## Models:
## fixed_effects_model: regional_mean_bioarea ~ metaecosystem_type + disturbance
## no_system_nr_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day))
##                     npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## fixed_effects_model    4 2363.1 2374.9 -1177.6   2355.1                     
## no_system_nr_model    30 2217.1 2305.4 -1078.5   2157.1 198.02 26  < 2.2e-16
##                        
## fixed_effects_model    
## no_system_nr_model  ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The full model is better, as we expected. Time plays such an important part in explaining the way that biomass changes.

Should we keep the interaction between metaecosystem and disturbance?

no_interaction_model = lmer(regional_mean_bioarea ~ metaecosystem_type + disturbance  + (metaecosystem_type || day) + (disturbance || day), data = ds_regional, REML = FALSE)

anova(no_system_nr_model, no_interaction_model)
## Data: ds_regional
## Models:
## no_interaction_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day))
## no_system_nr_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + metaecosystem_type * disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day)) + ((1 | day) + (0 + metaecosystem_type | day) + (0 + disturbance | day) + (0 + metaecosystem_type:disturbance | day))
##                      npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## no_interaction_model   12 2183.2 2218.4 -1079.6   2159.2                     
## no_system_nr_model     30 2217.1 2305.4 -1078.5   2157.1 2.0436 18          1

There is no statistical significance between the two models. Therefore, let’s drop the interaction between disturbance and meta-ecosystem type.

Should we keep the random slopes?

no_slopes_model = lmer(regional_mean_bioarea ~ metaecosystem_type + disturbance  + (1 | day), data = ds_regional, REML = FALSE)

anova(no_interaction_model, no_slopes_model)
## Data: ds_regional
## Models:
## no_slopes_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + (1 | day)
## no_interaction_model: regional_mean_bioarea ~ metaecosystem_type + disturbance + ((1 | day) + (0 + metaecosystem_type | day)) + ((1 | day) + (0 + disturbance | day))
##                      npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
## no_slopes_model         5 2185.8 2200.5 -1087.9   2175.8                       
## no_interaction_model   12 2183.2 2218.4 -1079.6   2159.2 16.598  7    0.02018 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Yes. We should keep the random slopes. Well, it seems like this is our best model then.

Best model

After model selection, we determined that the best model is the following one:

regional_mean_bioarea ~ metaecosystem_type + disturbance + (metaecosystem_type || day) + (disturbance || day)

Let’s call it then best_model to don’t get confused.

best_model = no_interaction_model

Effect size & significance of best model

Let’s now calculate the statistical significance and effect size of the best model. The marginal and conditional r squared are calculated using the package MuMIn. For the coding and interpretation of these r squared check the documentation for the r.squaredGLMM function.

model.null = lmer(regional_mean_bioarea ~ (1 | day), data = ds_regional, REML = FALSE)

r2_best = r.squaredGLMM(best_model)
## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help page.
r2_best = round(r2_best, digits = 3)

anova = anova(best_model, model.null)
p_best = anova$`Pr(>Chisq)`[2]
p_best = round(p_best, digits = 5)

if (p_best < 0.00001) {
  p_best = "< 0.00001"
}

Effect size & significance of meta-ecosystem type (in the best model)

metaecosystem_type_null = lmer(regional_mean_bioarea ~  disturbance  + (disturbance || day),
                               data = ds_regional, 
                               REML = FALSE, 
                               control = lmerControl(optimizer ='optimx', 
                                                     optCtrl=list(method='L-BFGS-B')))

r2_no_metaecosystem_type = r.squaredGLMM(metaecosystem_type_null)
r2_no_metaecosystem_type = round(r2_no_metaecosystem_type, digits = 3)

anova = anova(best_model, metaecosystem_type_null)
p_no_metaecosystem_type = anova$`Pr(>Chisq)`[2]
p_no_metaecosystem_type = round(p_no_metaecosystem_type, digits = 5)

if (p_no_metaecosystem_type < 0.00001) {
  p_no_metaecosystem_type = "< 0.00001"
}

Effect size & significance of disturbance (in the best model)

disturbance_null = lmer(regional_mean_bioarea ~ metaecosystem_type  + (metaecosystem_type || day), 
                        data = ds_regional, 
                        REML = FALSE,
                        control = lmerControl(optimizer ='optimx', 
                                                     optCtrl=list(method='L-BFGS-B')))

r2_no_disturbance = r.squaredGLMM(disturbance_null)
r2_no_disturbance = round(r2_no_disturbance, digits = 3)

anova = anova(best_model, disturbance_null)
p_no_disturbance = anova$`Pr(>Chisq)`[2]
p_no_disturbance = round(p_no_disturbance, digits = 5)

if (p_no_disturbance < 0.00001) {
  p_no_disturbance = "< 0.00001"
}

The effect size and significance of our results can be found in the following table.

Model Marginal R2 Marginal R2 of the best model explained Conditional R2 Conditional R2 of the best model explained P-value
Best model 0.077 / 0.872 / < 0.00001
Best model without meta-ecosystem type 0.039 0.038 0.746 0.126 < 0.00001
Best model without disturbance 0.06 0.017 0.77 0.102 < 0.00001

This means that meta-ecosystem type should explain more than disturbance regional biomass dynamics. However, the effect size isn’t that high (0.038). Because of this, we thought we should fit a function to the time dynamics of the regional biomass.

Fit function to time dynamics

Appendix: model diagnostics

plot(full_model) 

qqnorm(resid(full_model))

plot(full_model_correlated) 

qqnorm(resid(full_model_correlated))

plot(no_system_nr_model) 

qqnorm(resid(no_system_nr_model))

par(mfrow=c(2,3))
plot(fixed_effects_model, which = 1:5) 
par(mfrow=c(1,1))

plot(no_system_nr_model) 

qqnorm(resid(no_system_nr_model))

plot(no_slopes_model) 

qqnorm(resid(no_slopes_model))

plot(metaecosystem_type_null) 

qqnorm(resid(metaecosystem_type_null))

plot(disturbance_null) 

qqnorm(resid(disturbance_null))

Bibliography

Zuur, Alain F., Elena N. Ieno, Neil Walker, Anatoly A. Saveliev, and Graham M. Smith. 2009. Mixed effects models and extensions in ecology with R. Vol. 36. Statistics for Biology and Health. New York, NY: Springer New York. https://doi.org/10.1007/978-0-387-87458-6.